## load required libraries ----
library(tidyverse)
library(quanteda)
library(lexicon)
library(reshape2)
library(stringi)
library(quanteda.textplots)
library(quanteda.textmodels)
library(quanteda.textstats)
library(gridExtra)
library(seededlda)
library(ggrepel)
library(ggdendro)
library(factoextra)
library(lattice)
library(spacyr)

## clean workspace
## NOTE(review): rm(list = ls()) is generally discouraged in scripts (it only
## clears the global environment, not loaded packages or options); kept here
## to preserve the tutorial's original behavior.
rm(list=ls())

## set working directory (WD)
## Path to this lesson's src directory; adjust to your own machine.
wd_path <- "~/coliphi21/practice_lessons/lesson_2/src/"
## you can also set it dynamically:
## setwd(dirname(rstudioapi::getActiveDocumentContext()$path))
setwd(wd_path)

## check that WD is set correctly
getwd()
## [1] "/Users/lucienbaumgartner/coliphi21/practice_lessons/lesson_2/src"
For this tutorial you can either work with your own data, or the pre-built corpora provided in the /input folder for the first practice session. The quanteda package also contains pre-built corpora you can use. For this session, we scraped the Stanford Encyclopedia of Philosophy and built a corpus including additional metadata.
## Load the pre-built corpus; this imports an object called `sfe` into R.
## relative path
load('../input/stanford-encyclopedia.RDS')
## absolute path (alternative; only valid on the original author's machine):
## load('~/coliphi21/practice_lessons/lesson_2/input/stanford-encyclopedia.RDS')
## what does the corpus object look like?
sfe
## Corpus consisting of 1,712 documents and 21 docvars.
## 18thGerman-preKant.json :
## " In Germany, the eighteenth century was the age of enlighten..."
##
## abduction.json :
## " In the philosophical literature, the term abduction is used..."
##
## abelard.json :
## " Peter Abelard (1079–21 April 1142) [Abailard or Abaelard or..."
##
## abhidharma.json :
## " The first centuries after Śākyamuni Buddha death saw the ri..."
##
## abilities.json :
## " In the accounts we give of one another, claims about our ab..."
##
## abner-burgos.json :
## " Abner of Burgos (Alfonso de Valladolid; c. 1260–1347) was p..."
##
## [ reached max_ndoc ... 1,706 more documents ]
## summary statistics
summary(sfe) %>% head()
## available variables (document-level metadata)
docvars(sfe)
## Familiarize yourself a little more with the data.
## tokenization: split into word tokens, dropping punctuation, symbols,
## numbers, and URLs
toks <- tokens(sfe, what = 'word',
               remove_punct = TRUE, remove_symbols = TRUE, padding = FALSE,
               remove_numbers = TRUE, remove_url = TRUE)
## to lower
toks <- tokens_tolower(toks)
## lemmatizing via the lexicon package's token -> lemma hash table
toks <- tokens_replace(toks,
                       pattern = lexicon::hash_lemmas$token,
                       replacement = lexicon::hash_lemmas$lemma)
## remove stopwords
toks <- tokens_select(toks, pattern = stopwords("en"), selection = "remove")
## remove noise: single letters, tokens containing digits, any single character
## FIX(review): the original used [A-z], which in ASCII also matches the
## characters between 'Z' and 'a' ([ \ ] ^ _ `); [A-Za-z] is what was intended.
## (Note the ^.$ alternative already covers all single-character tokens.)
toks <- tokens_select(toks, pattern = '^[A-Za-z]$|[0-9]+|^.$', valuetype = 'regex', selection = 'remove')
## create dfm; trim rare terms (below the 80% term-frequency quantile) and
## overly common ones (appearing in more than 10% of documents)
dfm_sfe <- dfm(toks) %>%
  dfm_trim(min_termfreq = 0.8, termfreq_type = "quantile",
           max_docfreq = 0.1, docfreq_type = "prop")
## inspect the result (the object name was fused onto the previous line in
## the original, which was a syntax error)
dfm_sfe
## Document-feature matrix of: 1,712 documents, 24,689 features (98.47% sparse) and 21 docvars.
## features
## docs ethos immanuel thomasius pietist thomasians wolff well dis halle pietism
## 18thGerman-preKant.json 2 1 33 6 11 36 0 1 19 7
## abduction.json 0 0 0 0 0 0 0 0 0 0
## abelard.json 0 0 0 0 0 0 0 0 0 0
## abhidharma.json 0 0 0 0 0 0 0 0 0 0
## abilities.json 0 0 0 0 0 0 0 0 0 0
## abner-burgos.json 0 0 0 0 0 0 0 0 0 0
## [ reached max_ndoc ... 1,706 more documents, reached max_nfeat ... 24,679 more features ]
Check whether there is still some noise in the data and remove it. Hint: scan through the top features (e.g. with `topfeatures()`).
## remove tokens containing the Greek letter phi (a placeholder variable
## frequent in formal/logic articles)
toks <- tokens_select(toks, pattern = 'φ', valuetype = 'regex', selection = 'remove')
## re-create the dfm with the same trimming as before
dfm_sfe <- dfm(toks) %>%
  dfm_trim(min_termfreq = 0.8, termfreq_type = "quantile",
           max_docfreq = 0.1, docfreq_type = "prop")
## compute correspondence analysis model
## (kept in its own object instead of clobbering it with the coefficient
## dataframe, so the fitted model remains available for inspection)
sfe_ca_model <- textmodel_ca(dfm_sfe)
## coerce the document coordinates on the first two dimensions to a dataframe
sfe_ca <- data.frame(dim1 = coef(sfe_ca_model, doc_dim = 1)$coef_document,
                     dim2 = coef(sfe_ca_model, doc_dim = 2)$coef_document)
## strip '.json' (and any de-duplication suffix) from rownames to get clean IDs
sfe_ca$id <- gsub('\\.json(\\.[0-9])?', '', rownames(sfe_ca))
sfe_ca
## plot full data with branch annotation
## CA biplot of all documents, points coloured by dim1 - dim2 so the two
## diagonal branches are visually separated
ggplot(sfe_ca, aes(x=dim1, y=dim2, label=id)) +
geom_point(aes(color=dim1-dim2), alpha = 0.2) +
# plot 0.2 of all labels, using a repel function
# NOTE(review): sample_frac() draws from the global RNG when the plot is
# printed; the seed argument below fixes ggrepel's label placement only,
# not which labels get sampled
geom_text_repel(data = dplyr::sample_frac(sfe_ca, 0.2), max.overlaps = 15, seed = 6734) +
theme_bw() +
theme(plot.title = element_text(face='bold')) +
labs(title = 'Correspondence Analysis: Full Data')## plot parts of the data
## same plot, zoomed into x in [-1, 1] and y in [-2, 0]
## (setting limits via scale_*_continuous drops points outside the window)
ggplot(sfe_ca, aes(x=dim1, y=dim2, label=id)) +
geom_point(aes(color=dim1-dim2), alpha = 0.2) +
# plot 0.2 of all labels, using a repel function
geom_text_repel(data = dplyr::sample_frac(sfe_ca, 0.2), max.overlaps = 9, seed = 6734) +
scale_y_continuous(limits=c(-2,0)) +
scale_x_continuous(limits=c(-1,1)) +
theme_bw() +
theme(plot.title = element_text(face='bold')) +
labs(title = 'Correspondence Analysis: Zoom')## run naive unsupervised topic model with 10 topics
## seed the RNG so the stochastic LDA inference is reproducible
set.seed(123)
## fit an unsupervised LDA topic model with k = 10 topics on the trimmed dfm
sfe_lda <- textmodel_lda(dfm_sfe, k = 10)## print top 20 terms per topic
terms(sfe_lda, 20)## topic1 topic2 topic3 topic4 topic5 topic6 topic7 topic8 topic9 topic10
## [1,] "supervenience" "gene" "disability" "turing" "spacetime" "ockham" "chinese" "heidegger" "privacy" "ibn"
## [2,] "trope" "molecular" "oppression" "gödel" "einstein" "bacon" "spinoza" "dewey" "theism" "avicenna"
## [3,] "monism" "neural" "african" "algebra" "kuhn" "scotus" "reid" "husserl" "torture" "buddhist"
## [4,] "fictional" "simulation" "racial" "intuitionistic" "popper" "pythagoras" "dao" "du" "user" "maimonides"
## [5,] "bolzano" "dna" "coercion" "computation" "reichenbach" "boethius" "nietzsche" "malebranche" "clinical" "arabic"
## [6,] "brentano" "darwin" "feminism" "ordinal" "weyl" "pythagorean" "confucian" "artist" "whitehead" "averroes"
## [7,] "goodman" "fitness" "capability" "hilbert" "gravity" "parmenides" "sidgwick" "berlin" "delusion" "japanese"
## [8,] "physicalism" "inheritance" "domination" "cardinal" "hole" "proclus" "utilitarianism" "artistic" "theist" "buddha"
## [9,] "strawson" "fodor" "pornography" "tarski" "ramsey" "cicero" "mohists" "spinoza" "theistic" "buddhism"
## [10,] "noun" "ai" "distributive" "peirce" "bayesian" "philo" "conscience" "wolff" "hartshorne" "dharma"
## [11,] "bradley" "artifact" "marx" "algorithm" "entropy" "plotinus" "consequentialist" "nietzsche" "enhancement" "islamic"
## [12,] "austin" "imagery" "egalitarian" "algebraic" "bohr" "sextus" "luck" "herder" "doxastic" "indian"
## [13,] "armstrong" "biologist" "dworkin" "recursive" "newtonian" "porphyry" "laozi" "clarke" "omnipotent" "emptiness"
## [14,] "chisholm" "drift" "coercive" "brouwer" "payoff" "abelard" "zhuangzi" "sartre" "embryo" "mystical"
## [15,] "entailment" "biodiversity" "sovereign" "provable" "feyerabend" "bce" "zhu" "romantic" "suicide" "nishida"
## [16,] "plural" "cancer" "income" "zfc" "dynamical" "iamblichus" "thick" "bois" "internalism" "japan"
## [17,] "intension" "quale" "constitutional" "computable" "bell" "fr" "hutcheson" "artwork" "gratitude" "zen"
## [18,] "implicature" "adaptation" "liberalism" "cantor" "gas" "luther" "confucius" "fichte" "engine" "hebrew"
## [19,] "grice" "genome" "anderson" "diagram" "mach" "timaeus" "relativism" "schopenhauer" "csm" "vasubandhu"
## [20,] "meinong" "digital" "republican" "definable" "bet" "sophist" "wang" "collins" "goldman" "al-fārābī"
## plot the topics over the correspondence analysis data
## (topics() assigns each document its most likely topic)
sfe_ca$topics <- topics(sfe_lda)
ggplot(sfe_ca, aes(x=dim1, y=dim2, color=topics)) +
geom_point(alpha = 0.5, shape = '.') +
geom_density_2d(alpha = 0.5) +
theme_bw() +
theme(plot.title = element_text(face='bold')) +
labs(title = 'Correspondence Analysis with Topic Annotation (k=10)')
## Change the names of the topics (to some meaningful description) before plotting.
## rename the topics to meaningful descriptions (based on the top terms above)
sfe_ca$topics <- recode(sfe_ca$topics, topic1 = "body-mind", topic2 = "biology",
                        topic3 = "feminism/critical thinking", topic4 = "math/ai",
                        topic5 = "physics", topic6 = "classics", topic7 = "eastern",
                        topic8 = "phenomenology", topic9 = "religion",
                        topic10 = "middle-eastern/eastern")
## set seed for reproducible sampling
set.seed(48621)
## draw a random sample of 5 documents
## (the original comment said 20 documents, but the code samples 5)
sfe_sub <- sfe[sample(seq_along(sfe), 5)]
sfe_sub
## Corpus consisting of 5 documents and 21 docvars.
## albert-saxony.json :
## " Albert of Saxony (ca. 1320–1390), Master of Arts at Paris, ..."
##
## contractarianism.json.1 :
## " Contractarianism names both a political theory of the legit..."
##
## preferences.json.1 :
## " The notion of preference has a central role in many discipl..."
##
## plotinus.json :
## " Plotinus (204/5 – 270 C.E.), is generally regarded as the f..."
##
## paternalism.json :
## " Paternalism is the interference of a state or an individual..."
## PoS-tagging with spaCy (requires a spacy backend initialized via spacyr)
sfe_pos <- spacy_parse(sfe_sub, pos = TRUE, tag = TRUE, lemma = TRUE, entity = TRUE, dependency = TRUE)
sfe_pos
## look up which adjectives are used most frequently
sfe_pos %>%
filter(pos == 'ADJ') %>%
group_by(token) %>%
summarise(n.occurences = n()) %>%
arrange(desc(n.occurences))
## look up which nouns are preceded by the adjective "rational"
## NOTE(review): lag() runs over the whole token table, so a match could in
## principle cross document boundaries -- confirm this is acceptable here.
rational_noun <- sfe_pos %>% filter(pos == 'NOUN' & lag(token, 1) == 'rational')
rational_noun
# get top 2 nouns per document (summarise() drops the token grouping, so
# slice(1:2) operates within each remaining doc_id group)
rational_noun %>%
group_by(doc_id, token) %>%
summarise(n.occurences = n()) %>%
arrange(doc_id, desc(n.occurences)) %>%
slice(1:2)
## to create a corpus-object from your pos-tagged tokens
## we need unique IDs
sfe_pos
## make doc_ids unique, drop punctuation/space tokens, and build a
## token-level corpus in one pipeline
sfe_pos <- sfe_pos %>%
  mutate(doc_id = make.unique(doc_id)) %>%
  filter(!pos %in% c('PUNCT', 'SPACE')) %>%
  corpus(text_field = 'token', docid_field = 'doc_id')
sfe_pos
## Corpus consisting of 35,214 documents and 8 docvars.
## albert-saxony.json.1 :
## "Albert"
##
## albert-saxony.json.2 :
## "of"
##
## albert-saxony.json.3 :
## "Saxony"
##
## albert-saxony.json.5 :
## "ca"
##
## albert-saxony.json.6 :
## "."
##
## albert-saxony.json.7 :
## "1320–1390"
##
## [ reached max_ndoc ... 35,208 more documents ]
docvars(sfe_pos)
## WARNING! This data-structure is incompatible with our document-based corpus!!!
docvars(sfe_sub)
## ... but we can add the info to our token corpus
# add initial document ID to both sets of docvars
docvars(sfe_pos)$initial_docid <- gsub('\\.json.*', '', docid(sfe_pos))
docvars(sfe_sub)$initial_docid <- gsub('\\.json.*', '', docid(sfe_sub))
# join by initial id
docvars(sfe_pos) <- left_join(docvars(sfe_pos), docvars(sfe_sub), by = 'initial_docid')
docvars(sfe_pos)
## Keep in mind: your corpus is still on token level!
## hierarchical clustering - get distances on normalized dfm
## normalize the dfm to proportions, then compute euclidean distances
sfe_dist_mat <- dfm_weight(dfm_sfe, scheme = "prop") %>%
  textstat_dist(method = "euclidean") %>%
  as.dist()
## hierarchical clustering on the distance object (Ward's method)
sfe_cluster <- hclust(sfe_dist_mat, method = 'ward.D')
# label with document names (strip .json suffixes)
sfe_cluster$labels <- gsub('\\.json(\\.[0-9])?', '', docnames(dfm_sfe))
## determine best number of clusters (slow; uncomment to run)
# fviz_nbclust(as.matrix(sfe_dist_mat), FUN = hcut, method = "wss")
## cut tree into four groups
clusters <- cutree(sfe_cluster, k = 4)
## add cluster-data to the correspondence analysis
## (explicit `by` avoids dplyr's "Joining, by = ..." message and accidental
## natural joins on unintended columns)
sfe_ca_hcl <- left_join(sfe_ca, data.frame(cluster = clusters, id = names(clusters)), by = 'id')
## plot
ggplot(sfe_ca_hcl, aes(x=dim1, y=dim2, label=id)) +
geom_point(aes(color=as.factor(cluster)), alpha = 0.2) +
facet_grid(~as.factor(cluster))
## hierarchical clustering doesn't provide discrete clusters along
## the dimensions of the correspondence analysis
## subset documents about logic
## subset documents whose names contain 'logic' before or after a hyphen
logic <- dfm_subset(dfm_sfe, grepl('(?<=\\-)logic|logic(?=\\-)', docnames(dfm_sfe), perl = TRUE))
## compute cosine similarity between documents
logic_sim <- textstat_simil(logic, margin = 'document', method = 'cosine')
## all pairs with a cosine similarity > .4
as.data.frame(logic_sim) %>%
filter(cosine > .4) %>%
arrange(desc(cosine))
## Redo the cosine similarities for another subset of documents.
## subset documents about aesthetics
aesth <- dfm_subset(dfm_sfe, grepl('aesthetics', docnames(dfm_sfe), perl = TRUE))
## compute cosine similarity
## (stored under a separate name so the dfm subset is not clobbered,
## mirroring the logic_sim example above)
aesth_sim <- textstat_simil(aesth, margin = 'document', method = 'cosine')
## all pairs with a cosine similarity > .2
as.data.frame(aesth_sim) %>%
filter(cosine > .2) %>%
arrange(desc(cosine))
## subset documents about feminism
## subset documents whose names contain 'fem' before or after a hyphen
fem <- dfm_subset(dfm_sfe, grepl('(?<=\\-)fem|fem.*(?=\\-)', docnames(dfm_sfe), perl = TRUE))
## compute cosine similarities for the features
## "empowerment", "embodiment", and "rape"
## FIX(review): the original passed `logic` here, silently ignoring the `fem`
## subset created just above; use the feminism subset as intended.
fem_sim <- textstat_simil(fem, fem[, c("empowerment", "embodiment", "rape")],
                          margin = 'feature', method = 'cosine')
## top 5 results per feature
as.data.frame(fem_sim) %>%
group_by(feature2) %>%
arrange(feature2, desc(cosine)) %>%
slice_head(n=5)
## Redo the cosine similarities for a different set of features.
## same analysis for a different pair of features
## FIX(review): again use the `fem` subset; the original mistakenly
## reused `logic` here.
fem_sim <- textstat_simil(fem, fem[, c("feminism", "patriarchy")],
                          margin = 'feature', method = 'cosine')
## top 5 results per feature
as.data.frame(fem_sim) %>%
group_by(feature2) %>%
arrange(feature2, desc(cosine)) %>%
slice_head(n=5)
## A work by Lucien Baumgartner & Kevin Reuter
## Contact: lucien.baumgartner@philos.uzh.ch
## Website: https://lucienbaumgartner.github.io/